#' Attach a package, installing it from CRAN first when it is not available.
#'
#' The argument is captured unevaluated, so the package may be given either as
#' a bare name (`packages(dplyr)`) or as a string (`packages("dplyr")`).
#'
#' @param x Package name, bare or quoted.
#' @return Invisibly `TRUE` once the package is attached. An error is raised
#'   when `x` is not a single name or when the package still cannot be loaded
#'   after the installation attempt.
packages <- function(x) {
  # Take the argument exactly as written instead of evaluating it.
  x <- as.character(substitute(x))
  if (length(x) != 1L || is.na(x) || !nzchar(x)) {
    stop("`x` must be a single package name", call. = FALSE)
  }

  # Probe without attaching; install only when the package is really missing.
  if (!requireNamespace(x, quietly = TRUE)) {
    install.packages(pkgs = x, repos = "https://cran.r-project.org")
  }

  # library() raises an error (instead of returning FALSE like require()), so
  # a failed installation can no longer pass unnoticed.
  library(x, character.only = TRUE)
  invisible(TRUE)
}
# Attach every package used by the analysis, installing any that are missing.
# Names are passed bare because packages() captures its argument unevaluated.
#   dplyr        - %>%, select(), group_by(), summarise()
#   googleVis    - gvis*Chart() constructors and their plot() method
#   reshape      - cast()
#   plotly       - plot_ly()
#   tm           - Corpus(), tm_map() and the text transformations
#   RColorBrewer - brewer.pal()
#   wordcloud    - wordcloud()
# ggplot2 and RCurl are attached as well but are not referenced in the visible
# part of this script.
packages(ggplot2)
packages(dplyr)
packages(googleVis)
packages(reshape)
packages(plotly)
packages(tm)
packages(RColorBrewer)
packages(wordcloud)
packages(RCurl)
# Set the googleVis plot tag option; the previous option values are kept in
# `op` so that they can be restored at the end of the script.
op <- options(gvis.plot.tag = 'chart')

# Read the raw loan table.
loan <- read.csv(
  file = "/Users/catherinecao/Documents/lending_club_project/loan.csv"
)

# Prepend a day-of-month to every issue date so it can be parsed as a Date,
# then keep the four-digit year as a separate column.
loan[["issue_d"]] <- as.Date(
  sub("^", "01-", loan[["issue_d"]]),
  format = "%d-%b-%Y"
)
loan[["Year"]] <- format(loan[["issue_d"]], "%Y")
# Per issue date: total loan amount (Amount) and number of loans (Volume).
amnt_df <- loan %>%
  select(issue_d, loan_amnt) %>%
  group_by(issue_d) %>%
  summarise(Amount = sum(loan_amnt), Volume = n())

# Loan amount issued over time.
# `tag` is not a chart configuration option, so it is no longer passed inside
# `options`; the gvis.plot.tag value set through options() earlier in the
# script already applies to every plot() call.
Line <- gvisLineChart(
  amnt_df,
  xvar = "issue_d",
  yvar = "Amount",
  options = list(
    legend = "none",
    title = "Loan Amount Issued by Month",
    hAxis = "{title:'Date Issued'}",
    vAxis = "{title:'Amount($)'}"
  )
)
plot(Line)

# Number of loans issued over time.
Line2 <- gvisLineChart(
  amnt_df,
  xvar = "issue_d",
  yvar = "Volume",
  options = list(
    legend = "none",
    title = "Loan Volume Issued by Month",
    hAxis = "{title:'Date Issued'}",
    vAxis = "{title:'Volume'}"
  )
)
plot(Line2)
# Mean loan amount for each issue year
plot_avg_loan <- loan %>%
  select(Year, loan_amnt) %>%
  group_by(Year) %>%
  summarise(avg_loan = mean(loan_amnt))

Line_avg <- gvisLineChart(
  plot_avg_loan,
  xvar = "Year",
  yvar = "avg_loan",
  options = list(
    legend = "none",
    title = "Average Loan Size by Year",
    hAxis = "{title:'Year Issued'}",
    vAxis = "{title:'Average Loan Size($)'}"
  )
)
plot(Line_avg)
# Starting from 2012, Lending Club grows significantly.
# Grade: share of loans in each grade
plot_grade_pie <- as.data.frame(table(loan[["grade"]]))
grade_pie <- gvisPieChart(plot_grade_pie)
plot(grade_pie)

# Grade: interest rate distribution per grade (box plot)
p_grade_interest <- plot_ly(
  data = loan,
  y = ~int_rate,
  color = ~grade,
  type = "box"
)
p_grade_interest

# Grade: number of loans per year and grade, one column per grade after cast()
plot_grade <- loan %>%
  select(Year, loan_amnt, grade) %>%
  group_by(Year, grade) %>%
  summarise(total = n())
reshaped <- cast(plot_grade, formula = Year ~ grade)

SteppedArea <- gvisSteppedAreaChart(
  reshaped,
  xvar = "Year",
  yvar = c("A", "B", "C", "D", "E", "F", "G"),
  options = list(isStacked = "percent")
)
plot(SteppedArea)
# Loan status: share of loans in each status
plot_status <- as.data.frame(table(loan[["loan_status"]]))
Pie <- gvisPieChart(plot_status)
plot(Pie)

# Loan status: loan amount distribution per status (box plot)
p_status <- plot_ly(
  data = loan,
  y = ~loan_amnt,
  color = ~loan_status,
  type = "box"
)
p_status
# Warning text recorded from an earlier run (kept for reference):
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
# Total loan amount per state (addr_state), ignoring missing amounts
state_by_value <- loan %>%
  group_by(addr_state) %>%
  summarise(value = sum(loan_amnt, na.rm = TRUE))

# US map coloured by that total
GeoStates <- gvisGeoChart(
  state_by_value,
  locationvar = "addr_state",
  colorvar = "value",
  options = list(
    region = "US",
    displayMode = "regions",
    resolution = "provinces",
    width = 600,
    height = 400
  )
)
plot(GeoStates)
# Default rate by state. Loans in the strict "Default" status are rare, so a
# broad definition of "bad" statuses is used instead.
bad_indicators <- c(
  "Charged Off",
  "Default",
  "Does not meet the credit policy. Status:Charged Off",
  "In Grace Period",
  "Default Receiver",
  "Late (16-30 days)",
  "Late (31-120 days)"
)

# Compare on whitespace-trimmed statuses: the previous "Charged Off " entry
# carried a trailing space, and stray padding on either side must not decide
# whether a loan counts as bad.
loan_status_trimmed <- trimws(as.character(loan$loan_status))

# NOTE: despite its name, `is_bad` is 0 for a bad loan, 1 for any other
# non-blank status and NA when the status is blank.
loan$is_bad <- ifelse(
  loan_status_trimmed %in% bad_indicators,
  0,
  ifelse(loan_status_trimmed == "", NA, 1)
)

# Share of bad loans per state, in percent. Loans without a status are left
# out of both numerator and denominator, so one blank status no longer turns
# the whole state's rate into NA.
default_rate <- loan %>%
  group_by(addr_state) %>%
  summarise(
    countn = sum(!is.na(is_bad)),
    sumn = sum(is_bad, na.rm = TRUE),
    default_rate = (1 - sumn / countn) * 100
  )

GeoStates_defult <- gvisGeoChart(
  default_rate,
  locationvar = "addr_state",
  colorvar = "default_rate",
  options = list(
    region = "US",
    displayMode = "regions",
    resolution = "provinces",
    width = 600,
    height = 400
  )
)
plot(GeoStates_defult)
# Loan purpose: share of loans per stated purpose
plot_purpose <- as.data.frame(table(loan[["purpose"]]))
Pie_purpose <- gvisPieChart(data = plot_purpose)
plot(Pie_purpose)
# Word clouds ----

#' Build a lower-cased, punctuation-free corpus from the head of a text column.
#'
#' VectorSource() is used because DataframeSource() in recent tm releases only
#' accepts data frames that carry `doc_id` and `text` columns, which a bare
#' one-column data frame does not.
#'
#' @param text Character vector or factor, one document per element.
#' @param n Number of leading elements to keep.
#' @return A tm corpus.
make_text_corpus <- function(text, n = 10000) {
  corpus <- Corpus(VectorSource(as.character(head(text, n = n))))
  corpus <- tm_map(corpus, removePunctuation)
  tm_map(corpus, content_transformer(tolower))
}

# Column 22 - presumably the loan title, judging by the variable name;
# TODO confirm against the CSV header.
loan_title_corpus <- make_text_corpus(loan[[22]])
set.seed(123)
wordcloud(
  loan_title_corpus,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.3,
  use.r.layout = FALSE,
  colors = brewer.pal(8, "Paired")
)

# Column 11 - presumably the employer title; TODO confirm against the CSV
# header. Generic company words are removed so they do not dominate the cloud.
loan_emp_corpus <- make_text_corpus(loan[[11]])
loan_emp_corpus <- tm_map(
  loan_emp_corpus,
  removeWords,
  c("inc", "group", "corporation", "llc", "company", "and", "corp", "institute")
)
set.seed(124)
wordcloud(
  loan_emp_corpus,
  scale = c(2, 0.2),
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.3,
  use.r.layout = FALSE,
  colors = brewer.pal(8, "Paired")
)
# Employment length: share of loans per category
plot_emp_pie <- as.data.frame(table(loan$emp_length))
emp_pie <- gvisPieChart(plot_emp_pie)
plot(emp_pie)

# Employment length: number of loans per category, split by is_bad
plot_emp_len <- loan %>%
  select(emp_length, is_bad, loan_amnt) %>%
  group_by(emp_length, is_bad) %>%
  summarise(countn = n())

# One row per is_bad value, one column per employment length. The value
# column is named explicitly instead of leaving cast() to guess it.
reshaped_emp <- cast(plot_emp_len, is_bad ~ emp_length, value = "countn")

# Plot every employment-length column, but not the `is_bad` key itself:
# it is the x variable and must not show up as an extra series.
emp_series <- setdiff(names(reshaped_emp), "is_bad")
SteppedArea_emp <- gvisColumnChart(
  reshaped_emp,
  xvar = "is_bad",
  yvar = emp_series,
  options = list(isStacked = "percent")
)
plot(SteppedArea_emp)
# Home ownership: share of loans per category
plot(gvisPieChart(as.data.frame(table(loan$home_ownership))))

# Home ownership: number of loans per category, split by is_bad
plot_home <- loan %>%
  select(home_ownership, is_bad, loan_amnt) %>%
  group_by(home_ownership, is_bad) %>%
  summarise(countn = n())

# One row per is_bad value, one column per ownership category. The value
# column is named explicitly instead of leaving cast() to guess it.
reshaped_home <- cast(plot_home, is_bad ~ home_ownership, value = "countn")

# Positional subset carried over unchanged.
# NOTE(review): presumably this drops the second column, an ownership
# category that is not plotted below - confirm against the data.
reshaped_home <- reshaped_home[c(1,3:7)]

# The horizontal axis title is passed as `hAxis` with a balanced JSON object,
# like the other charts in this script; the former `hAxes` value had an
# unclosed "[" and was not a valid option string.
SteppedArea_home <- gvisColumnChart(
  reshaped_home,
  xvar = "is_bad",
  yvar = c("MORTGAGE", "NONE", "OTHER", "OWN", "RENT"),
  options = list(
    isStacked = 'percent',
    hAxis = "{title:'Default Status'}"
  )
)
plot(SteppedArea_home)
## Restore the option values that were in effect before this script changed
## them; `op` holds what the earlier options() call returned.
options(op)
# Reference: https://www.kaggle.com/erykwalczak/d/wendykan/lending-club-loan-data/initial-loan-book-analysis